# Restore the workspace saved by the LDA step. It presumably supplies `lda` and
# `forum_text_lda`, which are used below but never created in this script.
load("02_20_m5s_forum_lda.RData")

# Pin the process time zone to GMT so date-times are interpreted consistently.
Sys.setenv(TZ = 'GMT')

# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later at the topics() call.
library(topicmodels)

# Store the topic number that topics() reports for each document.
# NOTE(review): assumes the rows of forum_text_lda are in the same order as the
# documents the model was fitted on -- confirm.
forum_text_lda$lda_topic_int <- topics(lda)

# topicmodels::terms(lda, 20)
# table(topics(lda))

# Human-readable label for each of the 15 LDA topics, keyed by topic number.
# Topics 3 and 4 share a label; topics 6 and 14 are tagged 'undefined'.
topic_labels <- list('1' = 'environment', '2' = 'M5S - online participation', '3' = 'law-making', 
                     '4' = 'law-making', '5' = 'M5S - election and representation',
                     '6' = 'undefined', '7' = 'local institutions', '8' = 'M5S - national politics',
                     '9' = 'M5S - needs and change', '10' = 'rights - personal, family, cultural',
                     '11' = 'macroeconomy', '12' = 'liberalisation - drugs and prostitution', 
                     '13' = 'employment and education', '14' = 'undefined', '15' = 'microeconomy')

# Translate each post's topic number into its label. The lookup goes through a
# named character vector so that a topic number with no entry (or an NA topic)
# yields NA; indexing the list and coercing with as.character() would instead
# produce the literal string "NULL", which would then be counted as a topic.
forum_text_lda$lda_topic_label <-
  unname(unlist(topic_labels)[as.character(forum_text_lda$lda_topic_int)])

# library() aborts right away when dplyr is missing; require() would only
# return FALSE and let the pipeline below fail with a less obvious error.
library(dplyr)

# One row per author, ignoring posts whose topic label is 'undefined':
#   permanence - time between the author's first and last post, in days
#   posts      - number of posts
#   topics     - number of distinct topic labels the author posted under
user_time_topics <-
  subset(forum_text_lda, lda_topic_label != 'undefined') %>%
  dplyr::group_by(author_unique_id) %>%
  dplyr::summarize(
    # Subtracting date-times returns a difftime whose unit (secs, mins, hours,
    # days) is picked automatically for each result, so different authors could
    # end up on different scales; cut() below also rejects difftime input.
    # Converting with an explicit unit gives plain numbers in days.
    # NOTE(review): assumes `date` is a Date/POSIXct column -- confirm.
    permanence = as.numeric(max(date) - min(date), units = "days"),
    posts = dplyr::n(),
    topics = length(unique(lda_topic_label))
  )

# Split the observed permanence range into 50 equal-width bins labelled 1..50.
user_time_topics$cut <- cut(user_time_topics$permanence, breaks = 50, labels = 1:50)
# cor.test(user_time_topics$permanence, user_time_topics$topics)
# summary(lm(topics~permanence+posts, data=user_time_topics))

# library() fails immediately if ggplot2 is not installed; require() would only
# return FALSE and defer the failure to the first ggplot() call.
library(ggplot2)
# ggplot(user_time_topics, aes(x=cut, y=topics)) +
#   geom_boxplot() +
#   geom_smooth(data = user_time_topics, aes(x=as.numeric(cut), y=topics)) +
# scale_x_discrete(limits = as.character(1:50)) +
# scale_y_continuous(breaks = 1:12)

# require(igraph) 
# g_lda_topics <-
#   graph_from_data_frame(forum_text_lda[,c('author_unique_id', 'lda_topic_label')],
#                         directed = TRUE,
#                         vertices = unique(data.frame(id = c(forum_text_lda$author_unique_id, 
#                                                      forum_text_lda$lda_topic_label),
#                                               type = c(rep(TRUE, 50000), rep(FALSE, 50000)),
#                                               label = c(forum_text_lda$author_unique_id, 
#                                                         forum_text_lda$lda_topic_label),
#                                               stringsAsFactors = F)))
# g_lda_topics <- g_lda_topics - V(g_lda_topics)[V(g_lda_topics)$label == 'undefined']
# E(g_lda_topics)$weight <- 1
# g_lda_topics_proj <- bipartite.projection(g_lda_topics, which='false')

# Box plots of the number of topics per user for each of the 50 permanence
# bins, overlaid with a smoothed trend across the bin index.
ggplot(data = user_time_topics, mapping = aes(x = cut, y = topics)) +
  geom_boxplot() +
  # The smoother needs a continuous x, so the bin factor is mapped through
  # as.numeric(); the plot-level data is inherited by this layer.
  geom_smooth(mapping = aes(x = as.numeric(cut), y = topics)) +
  # Show every bin but only tick every tenth one. Tick text is bin number * 43.
  # NOTE(review): 43 is presumably the bin width in days -- confirm against the
  # actual permanence range.
  scale_x_discrete(
    limits = as.character(1:50),
    breaks = as.character(seq(0, 50, by = 10)),
    labels = as.character(seq(0, 50, by = 10) * 43)
  ) +
  scale_y_continuous(breaks = 1:12) +
  xlab('permanence on the Forum (days)') +
  ylab('number of topics discussed')
  